#HPAEC-PAD monosaccharide analysis data processing

from scipy import stats
from Tkinter import Tk
from tkFileDialog import askopenfilename
from tkFileDialog import asksaveasfilename

#turn data into dictionary -> each monosaccharide can be called
def makeDictionary (readData):
	"""Turn tab-separated text lines into a dictionary of columns.

	The first non-blank line is the header row: each of its fields becomes a
	key mapped to the list of values (kept as strings) found in that column
	on the following lines. A copy of the header row is also stored in the
	module-level global 'headings', which the later processing steps read.

	Trailing carriage returns / newlines are removed from every line, and
	blank lines (e.g. the empty string produced by a trailing newline in the
	file) are skipped instead of causing an 'index out of range' error.

	readData -- iterable of text lines with tab-separated fields
	Returns the dictionary {heading: [value, value, ...]}.
	Raises IndexError if a data row has fewer fields than the header row.
	"""
	global headings #shared with the other processing functions
	rawDataDict = {}
	first = True
	for lineNumber, line in enumerate(readData, 1):
		line = line.rstrip('\r\n') #keeps '\r' out of the last heading/value
		if not line.strip(): #nothing but whitespace: not a data row
			continue
		word = line.split('\t')
		if first: #header row -> dictionary keys
			first = False
			headings = word[:] #[:] stores a copy, not a reference to 'word'
			for x in word:
				rawDataDict[x] = [] #blank list for each key
		else:
			#check before appending so a short row cannot leave the columns with unequal lengths
			if len(word) < len(headings):
				raise IndexError('line %d has %d fields, expected %d' % (lineNumber, len(word), len(headings)))
			for i in range(len(headings)):
				rawDataDict[headings[i]].append(word[i])
	return rawDataDict

#correct the data for the losses using internal standard (erythritol)	
def rawDataCorrections (rawDataDict):
	"""Scale every sugar reading by the sample's internal-standard factor.

	The correction factor of a sample is the highest erythritol reading of
	the whole run divided by the erythritol reading of that sample. Every
	value in every sugar column is converted to float and multiplied by the
	factor of its row.

	The sugar columns are taken as headings[2:-1], i.e. the header row is
	assumed to start with the sample-name and tissue-mass columns and to end
	with the erythritol column. That list is stored in the module-level
	global 'sugarNames' for the later processing steps.

	rawDataDict -- dictionary of columns as built by makeDictionary; it is
	               modified in place
	Returns the same dictionary object with the sugar columns corrected.
	Raises ZeroDivisionError if an erythritol reading is 0, ValueError if a
	value cannot be parsed as a number or the erythritol column is empty.
	"""
	global sugarNames

	#values are stored as strings: convert first, because max() on strings
	#compares them alphabetically ('9.5' would beat '10.0')
	erythritol = [float(value) for value in rawDataDict['Erythritol']]
	maximum = max(erythritol)

	#one correction factor per row: highest erythritol read / sample read
	correctionFactors = [maximum/value for value in erythritol]

	#sugar names w/o sample names, tissue mass and erythritol
	sugarNames = headings[2:-1]

	#multiply each sugar value with the correction factor of its sample
	for x in sugarNames:
		column = rawDataDict[x]
		for y, factor in enumerate(correctionFactors):
			column[y] = float(column[y]) * factor
	return rawDataDict

#X-point calibration for each sugar using linear regression
def calibration (correctedDict, calibrationPoints):
	"""Convert corrected detector signals into concentrations.

	The first 'calibrationPoints' rows are the standards. Their names carry
	the concentration followed by a two-character unit (e.g. '5mM'), which
	gives the x values; the corrected reads of each sugar give the y values
	of a linear regression. Every remaining row is then converted with
	conc = (signal - intercept)/slope.

	Reads the module-level globals 'headings' and 'sugarNames'.

	correctedDict     -- columns with numeric (already corrected) sugar values
	calibrationPoints -- number of leading rows that are standards
	Returns a new dictionary with one key per heading except the last one:
	'Sample' holds the names of the non-standard rows, each sugar holds the
	calibrated concentrations, any other column is left as an empty list.
	"""
	#one empty result column for every heading but the last
	sugarData = {}
	for heading in headings[:-1]:
		sugarData[heading] = []
	sampleColumn = sugarData['Sample']

	#x-axis: concentration of each standard, parsed from its name without the unit
	standardNames = correctedDict['Sample'][:calibrationPoints]
	concentrations = [float(name[:-2]) for name in standardNames]

	#y-axis: reads of the standards; keep (slope, intercept) for every sugar
	fits = {}
	for sugar in sugarNames:
		reads = [correctedDict[sugar][row] for row in range(calibrationPoints)]
		regression = stats.linregress(concentrations, reads)
		fits[sugar] = (regression[0], regression[1])

	#everything after the standards is a real sample
	sampleColumn.extend(correctedDict['Sample'][calibrationPoints:])

	#invert y = ax + b to get the concentration behind each sample signal
	for sugar in sugarNames:
		slope, intercept = fits[sugar]
		column = sugarData[sugar]
		for signal in correctedDict[sugar][calibrationPoints:]:
			column.append((signal - intercept)/slope)
	return sugarData

def finalCorrections (calibratedData, correctedData, calibrationPoints, volume):
	"""Convert calibrated concentrations into nmol per mg of tissue.

	Each value is taken from mmol/L to mmol in the final sample volume
	(* 0.001 * volume), multiplied by the sugar's polymer-hydrolysis
	coefficient, converted from mmol to nmol (* 1000000) and divided by the
	tissue mass of the sample.

	Reads the module-level global 'sugarNames'.

	calibratedData    -- calibrated concentrations per sugar (standards
	                     already removed); modified in place
	correctedData     -- columns that still contain the standards; only the
	                     'Mass(mg)' column is read
	calibrationPoints -- number of leading standard rows to skip in 'Mass(mg)'
	volume            -- final volume of the sample solution in mL
	Returns calibratedData.
	Raises ValueError, before any value is changed, if a sugar has no
	hydrolysis coefficient; previously such a sugar silently reused the
	coefficient of the sugar processed before it.
	"""
	#polymer-hydrolysis correction coefficient for each supported sugar
	hydrolysisCoefficients = {
		'Ara': 0.88,
		'Xyl': 0.88,
		'Rha': 0.896,
		'Fuc': 0.896,
		'Gal': 0.9,
		'Glc': 0.9,
		'Man': 0.9,
		'GalA': 0.907,
	}

	#validate up front so a failure cannot leave the data half converted
	unknown = [name for name in sugarNames if name not in hydrolysisCoefficients]
	if unknown:
		raise ValueError('no hydrolysis coefficient defined for: ' + ', '.join(unknown))

	for i in sugarNames:
		coefficient = hydrolysisCoefficients[i]
		n = calibrationPoints	#exclude standards (calibration points)
		for j in range(len(calibratedData[i])):
			calibratedData[i][j] = calibratedData[i][j] * 0.001 * volume * coefficient * 1000000	#mmol/L -> mmol in final volume, hydrolysis correction, mmol -> nmol
			calibratedData[i][j] = calibratedData[i][j] / float(correctedData['Mass(mg)'][n]) #nmol / mg of tissue
			n += 1

	return calibratedData

def organiseAndCalculate (finalCorrectedData):
	"""Group the replicate values of every sugar by sample name.

	A sample label is everything in front of the first '#'; the part after
	it is the biological replicate identifier, so rows sharing a label are
	replicates of one sample. No statistics are calculated here: the values
	are only collected into one list per sample.

	Reads the module-level global 'sugarNames'.

	finalCorrectedData -- dictionary with a 'Sample' column and one column
	                      of values per sugar
	Returns a new dictionary: 'Sample' lists the distinct sample labels and
	each sugar maps to a list of value lists in that same sample order.
	"""
	#sample label of every row, replicate identifier removed
	labels = [entry.split('#')[0] for entry in finalCorrectedData['Sample']]

	#row positions of the replicates belonging to each distinct label
	rowsBySample = {}
	for position, label in enumerate(labels):
		rowsBySample.setdefault(label, []).append(position)

	#one key for the labels plus one (initially empty) key per sugar
	finalData = {'Sample': list(rowsBySample)}
	for sugar in sugarNames:
		finalData[sugar] = []

	#collect the replicate values of every sample, sugar by sugar
	for sugar in sugarNames:
		for genotype in rowsBySample:
			replicates = [finalCorrectedData[sugar][position] for position in rowsBySample[genotype]]
			finalData[sugar].append(replicates)
	return finalData
	
#============================================
	
#ask for the input file; the Tk root window is created only so the dialogs work and is hidden right away
Tk().withdraw()
filename = askopenfilename()
if not filename: #dialog was cancelled, nothing to process
	raise SystemExit('No input file selected.')
with open(filename) as openData: #'with' closes the file even if reading fails
	readData = openData.read()
#one list entry per line (fields still separated by \t); blank lines, e.g. after a trailing newline, are dropped
readData = [line for line in readData.splitlines() if line.strip()]

#============================================
rawDataDict = makeDictionary (readData)
corrected = rawDataCorrections (rawDataDict)
calibrated = calibration(corrected, 4) #must indicate how many calibration points
finalCor = finalCorrections(calibrated, corrected, 4, 0.5) #indicate calibration points and final sample volume in mL by changing the last two numbers, respectively
finalDictionary = organiseAndCalculate(finalCor)

#output the final dictionary as txt file suitable for excel import
newfile = asksaveasfilename()
if not newfile: #dialog was cancelled, do not try to open an empty path
	raise SystemExit('No output file selected.')
with open(newfile, 'w') as output: #closing the file guarantees everything is flushed to disk
	output.write('Sugar' + '\t' + 'Genotype' + '\t' + 'nmol/mg' + '\n')
	#go through the sugars in column order; finalDictionary['Sample'] only holds the genotype labels
	for key in sugarNames:
		for n, replicates in enumerate(finalDictionary[key]):
			genotype = finalDictionary['Sample'][n] #n-th value list belongs to the n-th genotype
			for number in replicates:
				output.write(key + '\t' + genotype + '\t' + str(number) + '\n')